{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "1304acf4",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-11-01T01:39:04.554569Z",
     "start_time": "2023-11-01T01:39:03.182135Z"
    }
   },
   "outputs": [],
   "source": [
    "import torch\n",
    "from torch import nn \n",
    "from d2l import torch as d2l"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "5af53ad5",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-11-01T01:39:04.558158Z",
     "start_time": "2023-11-01T01:39:04.555906Z"
    }
   },
   "outputs": [],
   "source": [
    "def get_tokens_and_segments(tokens_a, tokens_b=None):\n",
    "    \"\"\"获取输入序列的词元及其片段索引\"\"\"\n",
    "    tokens = ['<cls>'] + tokens_a + ['<sep>']\n",
    "    # 0和1分别标记片段A和B\n",
    "    segments = [0] * (len(tokens_a) + 2)\n",
    "    if tokens_b is not None:\n",
    "        tokens += tokens_b + ['<sep>']\n",
    "        segments += [1] * (len(tokens_b) + 1)\n",
    "    return tokens, segments"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "a87c7001",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-11-01T01:39:04.562373Z",
     "start_time": "2023-11-01T01:39:04.559118Z"
    }
   },
   "outputs": [],
   "source": [
    "class BERTEncoder(nn.Module):\n",
    "    \"\"\"BERT Encoder\"\"\"\n",
    "    def __init__(self, vocab_size, num_hiddens, norm_shape, ffn_num_input,\n",
    "                 ffn_num_hiddens, num_heads, num_layers, dropout, \n",
    "                 maxlen=1000, key_size=768, query_size=768, value_size=768, **kwargs):\n",
    "        super(BERTEncoder, self).__init__(**kwargs)\n",
    "        self.token_embeddinmg = nn.Embedding(vocab_size, num_hiddens)\n",
    "        self.segment_embedding = nn.Embedding(2, num_hiddens)\n",
    "        self.blks = nn.Sequential()\n",
    "        for i in range(num_layers):\n",
    "            self.blks.add_module(f\"{i}\", d2l.EncoderBlock(\n",
    "            key_size, query_size, value_size, num_hiddens, norm_shape,\n",
    "            ffn_num_input, ffn_num_hiddens, num_heads, dropout, True))\n",
    "        # 在BERT中，位置嵌入是可学习的，因此我们创建一个足够长的位置嵌入参数\n",
    "        self.pos_embedding = nn.Parameter(torch.randn((1, maxlen,num_hiddens)))\n",
    "        \n",
    "    def forward(self, tokens, segments, valid_lens):\n",
    "        X = self.token_embeddinmg(tokens) + self.segment_embedding(segments)\n",
    "        X = X + self.pos_embedding.data[:, :X.shape[1], :]\n",
    "        for blk in self.blks:\n",
    "            X = blk(X, valid_lens)\n",
    "        return X"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "d31b60f9",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-11-01T01:39:04.675165Z",
     "start_time": "2023-11-01T01:39:04.563689Z"
    }
   },
   "outputs": [],
   "source": [
    "vocab_size, num_hiddens, ffn_num_hiddens, num_heads = 10000, 768, 1024, 4\n",
    "norm_shape, ffn_num_input, num_layers, dropout = [768], 768, 2, 0.2\n",
    "encoder = BERTEncoder(vocab_size, num_hiddens, norm_shape, ffn_num_input,\n",
    "                      ffn_num_hiddens, num_heads, num_layers, dropout)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "fc4d916c",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-11-01T01:39:04.689208Z",
     "start_time": "2023-11-01T01:39:04.676724Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "torch.Size([2, 8, 768])"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokens = torch.randint(0, vocab_size, (2, 8))\n",
    "segments = torch.tensor([[0, 0, 0, 0, 1, 1, 1, 1], [0, 0, 0, 1, 1, 1, 1, 1]])\n",
    "encoded_X = encoder(tokens, segments, None)\n",
    "encoded_X.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "d884adff",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-11-01T01:39:04.693305Z",
     "start_time": "2023-11-01T01:39:04.690344Z"
    }
   },
   "outputs": [],
   "source": [
    "class MaskLM(nn.Module):\n",
    "    \"\"\"BERT的掩蔽语言模型任务\"\"\"\n",
    "    def __init__(self, vocab_size, num_hiddens, num_inputs=768, **kwargs):\n",
    "        super(MaskLM, self).__init__(**kwargs)\n",
    "        # # 创建一个具有num_hiddens个输入特征的LayerNorm层  \n",
    "        self.mlp = nn.Sequential(nn.Linear(num_inputs, num_hiddens),\n",
    "                                 nn.ReLU(),\n",
    "                                 nn.LayerNorm(num_hiddens),\n",
    "                                 nn.Linear(num_hiddens, vocab_size))\n",
    "\n",
    "    def forward(self, X, pred_positions):\n",
    "        num_pred_positions = pred_positions.shape[1]\n",
    "        pred_positions = pred_positions.reshape(-1)\n",
    "        batch_size = X.shape[0]\n",
    "        batch_idx = torch.arange(0, batch_size)\n",
    "        # 假设batch_size=2，num_pred_positions=3\n",
    "        # 那么batch_idx是np.array（[0,0,0,1,1,1]）\n",
    "        batch_idx = torch.repeat_interleave(batch_idx, num_pred_positions)\n",
    "        masked_X = X[batch_idx, pred_positions]\n",
    "        masked_X = masked_X.reshape((batch_size, num_pred_positions, -1))\n",
    "        mlm_Y_hat = self.mlp(masked_X)\n",
    "        return mlm_Y_hat"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "8af86ff0",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-11-01T01:39:04.722037Z",
     "start_time": "2023-11-01T01:39:04.694371Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "torch.Size([2, 3, 10000])"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "mlm = MaskLM(vocab_size, num_hiddens)\n",
    "mlm_positions = torch.tensor([[1, 5, 2], [6, 1, 5]])\n",
    "mlm_Y_hat = mlm(encoded_X, mlm_positions)\n",
    "mlm_Y_hat.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "f6bdb869",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-11-01T01:39:04.726371Z",
     "start_time": "2023-11-01T01:39:04.723224Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "torch.Size([6])"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "mlm_Y = torch.tensor([[7, 8, 9], [10, 20, 30]])\n",
    "loss = nn.CrossEntropyLoss(reduction='none')\n",
    "mlm_l = loss(mlm_Y_hat.reshape((-1, vocab_size)), mlm_Y.reshape(-1))\n",
    "mlm_l.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "dc51e597",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-11-01T01:39:04.729533Z",
     "start_time": "2023-11-01T01:39:04.727266Z"
    }
   },
   "outputs": [],
   "source": [
    "class NextSentencePred(nn.Module):\n",
    "    \"\"\"BERT的下一句预测任务\"\"\"\n",
    "    def __init__(self, num_inputs, **kwargs):\n",
    "        super(NextSentencePred, self).__init__(**kwargs)\n",
    "        self.output = nn.Linear(num_inputs, 2)\n",
    "\n",
    "    def forward(self, X):\n",
    "        # X的形状：(batchsize,num_hiddens)\n",
    "        return self.output(X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "c41e0806",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-11-01T01:39:04.734859Z",
     "start_time": "2023-11-01T01:39:04.731684Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "torch.Size([2, 2])"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "encoded_X = torch.flatten(encoded_X, start_dim=1)\n",
    "# NSP的输入形状:(batchsize，num_hiddens)\n",
    "nsp = NextSentencePred(encoded_X.shape[-1])\n",
    "nsp_Y_hat = nsp(encoded_X)\n",
    "nsp_Y_hat.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "6214cefc",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-11-01T01:39:04.738642Z",
     "start_time": "2023-11-01T01:39:04.735820Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "torch.Size([2])"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "nsp_y = torch.tensor([0, 1])\n",
    "nsp_l = loss(nsp_Y_hat, nsp_y)\n",
    "nsp_l.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "fa18e77e",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-11-01T01:39:04.743155Z",
     "start_time": "2023-11-01T01:39:04.739881Z"
    }
   },
   "outputs": [],
   "source": [
    "#@save\n",
    "class BERTModel(nn.Module):\n",
    "    \"\"\"BERT模型\"\"\"\n",
    "    def __init__(self, vocab_size, num_hiddens, norm_shape, ffn_num_input,\n",
    "                 ffn_num_hiddens, num_heads, num_layers, dropout,\n",
    "                 max_len=1000, key_size=768, query_size=768, value_size=768,\n",
    "                 hid_in_features=768, mlm_in_features=768,\n",
    "                 nsp_in_features=768):\n",
    "        super(BERTModel, self).__init__()\n",
    "        self.encoder = BERTEncoder(vocab_size, num_hiddens, norm_shape,\n",
    "                    ffn_num_input, ffn_num_hiddens, num_heads, num_layers,\n",
    "                    dropout, max_len=max_len, key_size=key_size,\n",
    "                    query_size=query_size, value_size=value_size)\n",
    "        self.hidden = nn.Sequential(nn.Linear(hid_in_features, num_hiddens),\n",
    "                                    nn.Tanh())\n",
    "        self.mlm = MaskLM(vocab_size, num_hiddens, mlm_in_features)\n",
    "        self.nsp = NextSentencePred(nsp_in_features)\n",
    "\n",
    "    def forward(self, tokens, segments, valid_lens=None,\n",
    "                pred_positions=None):\n",
    "        encoded_X = self.encoder(tokens, segments, valid_lens)\n",
    "        if pred_positions is not None:\n",
    "            mlm_Y_hat = self.mlm(encoded_X, pred_positions)\n",
    "        else:\n",
    "            mlm_Y_hat = None\n",
    "        # 用于下一句预测的多层感知机分类器的隐藏层，0是“<cls>”标记的索引\n",
    "        nsp_Y_hat = self.nsp(self.hidden(encoded_X[:, 0, :]))\n",
    "        return encoded_X, mlm_Y_hat, nsp_Y_hat"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python(d2l)",
   "language": "python",
   "name": "d2l"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.17"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
